import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

from SparkSessionBase import SparkSessionBase


# Job class derived from SparkSessionBase.
class TextRandJob(SparkSessionBase):
    """Spark job that prints a 100-row sample of the ``review`` table.

    The class-level settings below are presumably read by
    ``SparkSessionBase._create_spark_session`` when it builds the session
    (TODO confirm against that class; it is not visible in this file).
    """

    SPARK_URL = "yarn"
    SPARK_APP_NAME = '1234567'
    ENABLE_HIVE_SUPPORT = True

    def __init__(self):
        """Create the Spark session and lower log verbosity to ERROR."""
        self.spark = self._create_spark_session()
        spark_context = self.spark.sparkContext
        spark_context.setLogLevel("ERROR")

    def run(self):
        """Query the ``review`` table and print 100 rows of selected columns.

        Nothing is returned; the rows are written to standard output by
        ``DataFrame.show``.
        """
        # Columns of the review table that are kept in the printed output.
        review_columns = [
            "review_id",
            "rev_user_id",
            "rev_business_id",
            "rev_stars",
            "rev_useful",
            "rev_funny",
        ]

        # Load every row/column of the table through Spark SQL.
        all_reviews = self.spark.sql("SELECT * FROM review")

        # NOTE(review): no ordering is applied before the limit, so this is
        # simply whichever 100 rows Spark returns first, not the highest-rated
        # ones. Add an explicit orderBy if a ranking is intended.
        projected = all_reviews.select(*review_columns)
        first_hundred = projected.limit(100)

        # Print the rows without truncating long cell values.
        first_hundred.show(truncate=False)


# Script entry point: build the job, then execute it.
if __name__ == '__main__':
    job = TextRandJob()
    job.run()